In [1]:
import os
import pandas as pd
import numpy as np
import AGEpy as age
from biomart import BiomartServer
In [2]:
# Project directory holding the input spreadsheet (also used as the output location).
folder = "/nexus/posix0/MAGE-flaski/service/projects/data/Adam_Antebi/AA_spliceosomics/"
# `folder` ends with "/", so joining yields exactly folder + file name.
anonymous = os.path.join(folder, "anonymous_aging_hits.xlsx")
# Load the hits table into a DataFrame.
df_anonymous = pd.read_excel(anonymous)
In [3]:
# Preview the first five rows of the loaded table.
df_anonymous.head(n=5)
Out[3]:
GeneNames | gene_id | Class | chr_id | start | stop | strand | type | position | sites | ... | log2FC_wt_D0_vs_wt_D7 | pvalue_wt_D14_vs_wt_D21 | padj_wt_D14_vs_wt_D21 | log2FC_wt_D14_vs_wt_D21 | pvalue_wt_D7_vs_wt_D14 | padj_wt_D7_vs_wt_D14 | log2FC_wt_D7_vs_wt_D14 | pvalue_wt_D7_vs_wt_D21 | padj_wt_D7_vs_wt_D21 | log2FC_wt_D7_vs_wt_D21 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | WBGene00002978 | sajrG002946 | c | I | 14622304 | 14622511 | -1 | ALT | INTERNAL | ad | ... | 1.597883 | 0.644669 | 1 | 0.049143 | 6.353030e-20 | 2.243961e-16 | 1.013759 | 6.329868e-29 | 3.999084e-25 | 1.062902 |
1 | WBGene00003738 | sajrG039325 | = | V | 12915313 | 12915555 | 1 | ALT | INTERNAL | ad | ... | 0.998421 | 0.269557 | 1 | 0.321127 | 2.600493e-09 | 1.198073e-06 | 1.491427 | 2.261997e-09 | 4.438150e-07 | 1.812554 |
2 | WBGene00005007 | sajrG028932 | = | IV | 8892046 | 8892048 | 1 | ALT | INTERNAL | aa | ... | 0.490385 | 0.448171 | 1 | 0.068356 | 2.050674e-05 | 2.678814e-03 | 0.294061 | 1.764764e-05 | 1.376472e-03 | 0.362417 |
3 | WBGene00006759 | sajrG030873 | = | IV | 11994770 | 11995108 | -1 | ALT | INTERNAL | ad | ... | 0.768802 | 0.769264 | 1 | 0.099608 | 2.799405e-06 | 5.088198e-04 | 0.812186 | 1.119455e-06 | 1.211044e-04 | 0.911795 |
4 | WBGene00006780 | sajrG028599 | c | IV | 5966653 | 5966763 | 1 | ALT | FIRST | sd | ... | 1.029296 | 0.824274 | 1 | 0.027321 | 4.292683e-11 | 2.966524e-08 | 1.025532 | 2.593437e-15 | 1.780958e-12 | 1.052853 |
5 rows × 31 columns
In [4]:
# Settings for the species under analysis.
biomarthost = "http://jul2023.archive.ensembl.org/biomart/"
python_output = folder
species = "caenorhabditis elegans"

# Taxon identifier (kept as a string) for each species name.
# NOTE(review): presumably NCBI taxonomy IDs — confirm.
taxons = {
    "caenorhabditis elegans": "6239",
    "drosophila melanogaster": "7227",
    "mus musculus": "10090",
    "homo sapiens": "9606",
    "saccharomyces cerevisiae": "4932",
    "nothobranchius furzeri": "105023",
}

# Short organism tag for each species; not every species in `taxons` has one.
tags = {
    "caenorhabditis elegans": "CEL",
    "drosophila melanogaster": "DMEL",
    "mus musculus": "MUS",
    "homo sapiens": "HSA",
}

taxon_id = taxons[species]
# aging_genes = []
# ### ATTENTION ### if you are using yeast, you will need to uncomment the following lines
# if species in tags.keys():
# organismtag=tags[species]
# if not os.path.isfile(python_output+"/homdf.txt"):
# print("Could not find ageing evidence table. Using biomart to create one.")
# sys.stdout.flush()
# homdf,HSA,MUS,CEL,DMEL=age.FilterGOstring(host=biomarthost)
# homdf.to_csv(python_output+"/homdf.txt", index=None,sep="\t")
# else:
# print("Found existing ageing evidence table.")
# sys.stdout.flush()
# homdf=pd.read_csv(python_output+"/homdf.txt", sep="\t")
# aging_genes=homdf[[organismtag+"_ensembl_gene_id","evidence"]].dropna()
# aging_genes=aging_genes[aging_genes[organismtag+"_ensembl_gene_id"]!="None"]
# aging_genes=aging_genes[organismtag+"_ensembl_gene_id"].tolist()
In [5]:
# Point the BioMart client at an Ensembl archive and select the C. elegans gene dataset.
# NOTE(review): this URL (dec2021) differs from `biomarthost` (jul2023) defined in the
# configuration cell — confirm which Ensembl release is intended.
server = BiomartServer('http://dec2021.archive.ensembl.org/biomart')
celegans_dataset = server.datasets["celegans_gene_ensembl"]
celegans_dataset
Out[5]:
Caenorhabditis elegans genes (WBcel235)
In [6]:
# Columns requested from BioMart: the worm gene ID and name, plus the ID, name and
# orthology-confidence value of each human homolog.
attributes = [
    'ensembl_gene_id',
    'external_gene_name',
    'hsapiens_homolog_ensembl_gene',
    'hsapiens_homolog_associated_gene_name',
    'hsapiens_homolog_orthology_confidence',
]
raw_response = celegans_dataset.search({'attributes': attributes})

# The body is tab-separated text, one record per line. When it ends with a newline,
# split("\n") yields a trailing empty string that would otherwise become a spurious
# row ('' followed by None padding) in the DataFrame, so empty lines are dropped here.
rows = [
    line.split("\t")
    for line in raw_response.content.decode().split("\n")
    if line
]

# A record with the wrong number of fields is not query data (for example an error
# message returned in the body). Fail loudly instead of loading it as a gene row.
malformed = [row for row in rows if len(row) != len(attributes)]
if malformed:
    raise ValueError(
        "Unexpected BioMart response: expected %d tab-separated fields per line, got %r"
        % (len(attributes), "\t".join(malformed[0]))
    )

# One row per (worm gene, human homolog) pair; all values are strings, with ''
# where BioMart returned no value.
response = pd.DataFrame(rows, columns=attributes)
response.head()
Out[6]:
ensembl_gene_id | external_gene_name | hsapiens_homolog_ensembl_gene | hsapiens_homolog_associated_gene_name | hsapiens_homolog_orthology_confidence | |
---|---|---|---|---|---|
0 | WBGene00000001 | aap-1 | ENSG00000278139 | P3R3URF-PIK3R3 | 0 |
1 | WBGene00000001 | aap-1 | ENSG00000117461 | PIK3R3 | 0 |
2 | WBGene00000001 | aap-1 | ENSG00000268173 | 0 | |
3 | WBGene00000001 | aap-1 | ENSG00000105647 | PIK3R2 | 0 |
4 | WBGene00000002 | aat-1 | ENSG00000092068 | SLC7A8 | 0 |
In [11]:
# # aging genes
def check_aging_genes(gene_names, genes_of_interest):
    """Tell whether any gene in a comma-separated string belongs to a reference collection.

    Despite its name the check is generic: this notebook also uses it to flag
    genes that have a human homolog.

    Parameters
    ----------
    gene_names : str
        One or more gene identifiers separated by commas. Whitespace around
        each identifier is ignored. Any non-string value (e.g. NaN or None from
        an empty spreadsheet cell) is treated as containing no genes.
    genes_of_interest : container
        Collection supporting the ``in`` operator (list, set, array, ...).

    Returns
    -------
    str
        'yes' if at least one identifier is in ``genes_of_interest``, else 'no'.
    """
    # Empty cells arrive as NaN (a float) and have no .split(); report no match
    # instead of raising AttributeError.
    if not isinstance(gene_names, str):
        return 'no'
    # Strip so that "a, b" matches the same identifiers as "a,b".
    gene_list = [gene.strip() for gene in gene_names.split(',')]
    # Empty fragments (from "a," or "") are skipped so they can never match an
    # empty entry in the reference collection.
    return 'yes' if any(gene and gene in genes_of_interest for gene in gene_list) else 'no'
# # Apply the function to create a new column 'AgingGene'
# df_anonymous['AgingGene'] = df_anonymous['GeneNames'].apply(check_aging_genes, genes_of_interest=aging_genes)
# df_anonymous
In [8]:
# df_Yidong['AgingGene'] = df_Yidong['GeneNames'].apply(check_aging_genes, genes_of_interest=aging_genes)
# df_Yidong
In [9]:
# homologs
# One gene ID per element: cells listing several comma-separated IDs are split apart.
gene_names_expanded = df_anonymous['GeneNames'].str.split(',').explode()

# Keep only the BioMart rows whose worm gene appears in the hits table.
in_hit_list = response['ensembl_gene_id'].isin(gene_names_expanded)
response_subset = response[in_hit_list]

# Of those, keep rows that actually carry a human homolog ID (neither missing nor '').
human_homolog_id = response_subset['hsapiens_homolog_ensembl_gene']
has_human_homolog = human_homolog_id.notna() & human_homolog_id.ne('')
response_subset_hs = response_subset[has_human_homolog]

# Distinct worm gene IDs that have at least one human homolog.
response_subset_hs_ids = response_subset_hs['ensembl_gene_id'].unique()
In [12]:
# Flag each row 'yes'/'no' depending on whether any of its genes has a human homolog.
df_anonymous['Human_homologs'] = df_anonymous['GeneNames'].map(
    lambda names: check_aging_genes(names, genes_of_interest=response_subset_hs_ids)
)

# Write the annotated table next to the input file, without the index column.
annotated_path = folder + "anonymous_aging_hits_annotated.xlsx"
df_anonymous.to_excel(annotated_path, index=False)

df_anonymous.head()
Out[12]:
GeneNames | gene_id | Class | chr_id | start | stop | strand | type | position | sites | ... | pvalue_wt_D14_vs_wt_D21 | padj_wt_D14_vs_wt_D21 | log2FC_wt_D14_vs_wt_D21 | pvalue_wt_D7_vs_wt_D14 | padj_wt_D7_vs_wt_D14 | log2FC_wt_D7_vs_wt_D14 | pvalue_wt_D7_vs_wt_D21 | padj_wt_D7_vs_wt_D21 | log2FC_wt_D7_vs_wt_D21 | Human_homologs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | WBGene00002978 | sajrG002946 | c | I | 14622304 | 14622511 | -1 | ALT | INTERNAL | ad | ... | 0.644669 | 1 | 0.049143 | 6.353030e-20 | 2.243961e-16 | 1.013759 | 6.329868e-29 | 3.999084e-25 | 1.062902 | yes |
1 | WBGene00003738 | sajrG039325 | = | V | 12915313 | 12915555 | 1 | ALT | INTERNAL | ad | ... | 0.269557 | 1 | 0.321127 | 2.600493e-09 | 1.198073e-06 | 1.491427 | 2.261997e-09 | 4.438150e-07 | 1.812554 | no |
2 | WBGene00005007 | sajrG028932 | = | IV | 8892046 | 8892048 | 1 | ALT | INTERNAL | aa | ... | 0.448171 | 1 | 0.068356 | 2.050674e-05 | 2.678814e-03 | 0.294061 | 1.764764e-05 | 1.376472e-03 | 0.362417 | yes |
3 | WBGene00006759 | sajrG030873 | = | IV | 11994770 | 11995108 | -1 | ALT | INTERNAL | ad | ... | 0.769264 | 1 | 0.099608 | 2.799405e-06 | 5.088198e-04 | 0.812186 | 1.119455e-06 | 1.211044e-04 | 0.911795 | yes |
4 | WBGene00006780 | sajrG028599 | c | IV | 5966653 | 5966763 | 1 | ALT | FIRST | sd | ... | 0.824274 | 1 | 0.027321 | 4.292683e-11 | 2.966524e-08 | 1.025532 | 2.593437e-15 | 1.780958e-12 | 1.052853 | yes |
5 rows × 32 columns
In [ ]:
In [ ]: